
**********
* Readme *
**********

* This script estimates 
* [1] duration models (for employment and unemployment) on PNAD data with our max likelihood method and 
* [2] estimates the expected transition hazards, conditional on individual's covariates using the POF data.


* Root folder (PATH TO BE DEFINED BY THE USER)
**********************************************
clear all
global analysis "C:/***/replication_package"


* Timestamped log
*****************
global today = strofreal(date(c(current_date), "DMY"), "%tdYYNNDD")
log using "${analysis}/code/logs/4_3_estimate_duration_models_${today}.smcl", replace


***********************
* Parallel parameters *
***********************

* How many processor do I have?
parallel numprocessors
global myprocessors = r(numprocessors) - 1

* Set number of child processes
parallel initialize $myprocessors


************************
* Bootstrap parameters *
************************

* How many bootstrap replications?
global B = 400


*******************************
* Import our general ML model *
*******************************

program drop _all
do "${analysis}/code/prog_duration_d1.do"

/* This ML d1 allows for 
- monotonic hazard (constant or not) 
- any type of censoring or truncation 
- optional 2-types mixture heterogeneity

NOTES: 
- t_0, t_a and t_b can have different names, but they need to appear in this specific order.
- Add the "missing" option to make sure the program won't ignore the cases with right censoring (ie those where t_b is missing) */

* Constraint for the exponential case
constraint 1 [shape]_cons = 1

* Force estimation without het terms
constraint 2 [ln_odds]_cons = -750
constraint 3 [ln_u]_cons = -750


***********************
* EMPLOYMENT duration *
***********************

* Import data
use "${analysis}/data/2_5_pnad_clean_duration_e.dta", clear
codebook ind_id


****************************************************************************
* Define covariates (same set for both employment and unemployment models) *
****************************************************************************

* Define set of covariates
unab covariates_all : rg_* ae_* at_* fp_* n_kids n_youngs n_adults n_seniors region_* // all
unab ref_groups : rg_1_nonwhite_female ae_1_1 fp_h_wp_nk region_SP_c // reference group: non white female, no education, head w partner no kids, Sao Paulo
global covariates : list covariates_all - ref_groups // final list of regressors


****************************************************************************************
* Starting point: Standard exponential model (no interval censoring, no heterogeneity) *
****************************************************************************************

* Set a transition point half-way between ta_e and tb_e
gen midpoint_e = (ta_e + tb_e)/2
replace midpoint_e = ta_e if missing(tb_e)

* Declare survival-time data with a single-failure per subject; reference interval unit is month
stset midpoint_e [fweight = fweight], failure(fail_e == 1) enter(t0_e) 

* Standard exponential model
eststo streg_e, title("Streg exponential"): streg $covariates, distribution(exponential) nohr
estimates save "${analysis}/results/estimations/4_3_mod_streg_e.ster", replace

* Store coefficients to use as initial values in the next step
matrix base_coeff_e = e(b)
matrix coleq base_coeff_e = scale // change the equation name from "_t" to "scale"


*****************************************************************************
* Constant hazards, all types of censoring and truncation, no heterogeneity *
*****************************************************************************

ml model d1 duration (scale: t0_e ta_e tb_e = $covariates) (shape:) (ln_u:) (ln_odds:) [fweight = fweight], ///
  missing constraint(1 2 3) technique(dfp 10 bfgs 10) ///
  diparm(ln_u, exp label("heterog") prob) ///
  diparm(ln_odds, invlogit label("share") prob)

ml init base_coeff_e
eststo ml_exp_e, title("Exponential"): ml maximize, difficult search(off)
estimates save "${analysis}/results/estimations/4_3_mod_ml_exp_e.ster", replace

* Store coefficients to use as initial values in the next step
matrix base_coeff_e = e(b)
matrix base_coeff_e[1, colnumb(base_coeff_e, "ln_u:_cons")] = 0    // initial value for high type baseline gap
matrix base_coeff_e[1, colnumb(base_coeff_e, "ln_odds:_cons")] = 0 // initial value for share of high type


***************************************************************************************
* Constant hazards, all types of censoring and truncation, 2-types mixture (BASELINE) *
***************************************************************************************

ml model d1 duration (scale: t0_e ta_e tb_e = $covariates) (shape:) (ln_u:) (ln_odds:) [fweight = fweight], ///
  missing constraint(1) technique(dfp 10 bfgs 10) ///
  diparm(ln_u, exp label("heterog") prob) ///
  diparm(ln_odds, invlogit label("share") prob)

ml init base_coeff_e
eststo ml_exp_mix_e, title("Exponential mixture"): ml maximize, difficult search(off)
estimates save "${analysis}/results/estimations/4_3_mod_ml_exp_mix_e.ster", replace


********************************************************
* Program to bootstrap the sample and store each model *
********************************************************

* Folders for temp files
cap mkdir "${analysis}/results/estimations/temp/"
cap mkdir "${analysis}/results/estimations/temp/e/"
cap mkdir "${analysis}/results/estimations/temp/u/"

program duration_model_e, eclass

  constraint 1 [shape]_cons = 1

  * Recover good initial values
  estimates use "${analysis}/results/estimations/4_3_mod_ml_exp_mix_e.ster"
  matrix base_coeff_e = e(b)

  * Estimation of interest: constant hazards, all types of censoring and truncation, 2-types mixture
  ml model d1 duration (scale: t0_e ta_e tb_e = $covariates) (shape:) (ln_u:) (ln_odds:) [fweight = fweight], missing constraint(1) technique(dfp 10 bfgs 10) diparm(ln_u, exp label("heterog") prob) diparm(ln_odds, invlogit label("share") prob)
  ml init base_coeff_e
  capture ml maximize, difficult search(off)

  * Store duration models
  local how_many_files : dir "${analysis}/results/estimations/temp/e/" files "*.ster"
  local next_file : word count `how_many_files'
  capture estimates save "${analysis}/results/estimations/temp/e/`next_file'.ster", replace
 
end


*********************************************
* Run the program with clustered resampling *
*********************************************

* Remove previous estimations, if any
local files_to_delete : dir "${analysis}/results/estimations/temp/e/" files "*.ster"
foreach file of local files_to_delete {
  erase "${analysis}/results/estimations/temp/e/`file'"
}

parallel bootstrap, strata(region) cluster(psu_id) reps($B) programs(duration): duration_model_e
estimates save "${analysis}/results/estimations/4_3_mod_ml_exp_mix_e_boot.ster", replace


*************************
* UNEMPLOYMENT duration *
*************************

* Import data
use "${analysis}/data/2_5_pnad_clean_duration_u.dta", clear
codebook ind_id


****************************************************************************************
* Starting point: Standard exponential model (no interval censoring, no heterogeneity) *
****************************************************************************************

* Set a transition point half way between ta_u and tb_u
gen midpoint_u = (ta_u + tb_u)/2
replace midpoint_u = ta_u if missing(tb_u)

* Declaring survival-time data with a single-failure per subjetc; reference interval unit is month
stset midpoint_u [fweight = fweight], failure(fail_u == 1) enter(t0_u) 

* Standard exponential model
eststo streg_u, title("Streg Exponential"): streg $covariates, distribution(exponential) nohr
estimates save "${analysis}/results/estimations/4_3_mod_streg_u.ster", replace

* Store coefficients to use as initial values in the next step
matrix base_coeff_u = e(b)
matrix coleq base_coeff_u = scale // change the equation name from "_t" to "scale"


*****************************************************************************
* Constant hazards, all types of censoring and truncation, no heterogeneity *
*****************************************************************************

ml model d1 duration (scale: t0_u ta_u tb_u = $covariates) (shape:) (ln_u:) (ln_odds:) [fweight = fweight], ///
  missing constraint(1 2 3) technique(dfp 10 bfgs 10) ///
  diparm(ln_u, exp label("heterog") prob) ///
  diparm(ln_odds, invlogit label("share") prob)

ml init base_coeff_u
eststo ml_exp_u, title("Exponential"): ml maximize, difficult search(off)
estimates save "${analysis}/results/estimations/4_3_mod_ml_exp_u.ster", replace

* Store coefficients to use as initial values in the next step
matrix base_coeff_u = e(b)
matrix base_coeff_u[1, colnumb(base_coeff_u, "ln_u:_cons")] = 0    // initial value for high type baseline gap
matrix base_coeff_u[1, colnumb(base_coeff_u, "ln_odds:_cons")] = 0 // initial value for share of high type


***************************************************************************************
* Constant hazards, all types of censoring and truncation, 2-types mixture (BASELINE) *
***************************************************************************************

ml model d1 duration (scale: t0_u ta_u tb_u = $covariates) (shape:) (ln_u:) (ln_odds:) [fweight = fweight], ///
  missing constraint(1) technique(dfp 10 bfgs 10) ///
  diparm(ln_u, exp label("heterog") prob) ///
  diparm(ln_odds, invlogit label("share") prob)

ml init base_coeff_u
eststo ml_exp_mix_u, title("Exponential mixture"): ml maximize, difficult search(off)
estimates save "${analysis}/results/estimations/4_3_mod_ml_exp_mix_u.ster", replace


********************************************************
* Program to bootstrap the sample and store each model *
********************************************************

program duration_model_u, eclass

  constraint 1 [shape]_cons = 1

  * Recover good initial values
  estimates use "${analysis}/results/estimations/4_3_mod_ml_exp_mix_u.ster"
  matrix base_coeff_u = e(b)

  * Estimation of interest: constant hazards, all types of censoring and truncation, 2-types mixture
  ml model d1 duration (scale: t0_u ta_u tb_u = $covariates) (shape:) (ln_u:) (ln_odds:) [fweight = fweight], missing constraint(1) technique(dfp 10 bfgs 10) diparm(ln_u, exp label("heterog") prob) diparm(ln_odds, invlogit label("share") prob)
  ml init base_coeff_u
  capture ml maximize, difficult search(off)

  * Store duration models
  local how_many_files : dir "${analysis}/results/estimations/temp/u/" files "*.ster"
  local next_file : word count `how_many_files'
  capture estimates save "${analysis}/results/estimations/temp/u/`next_file'.ster", replace
 
end


*********************************************
* Run the program with clustered resampling *
*********************************************

local files_to_delete : dir "${analysis}/results/estimations/temp/u/" files "*.ster"
foreach file of local files_to_delete {
  erase "${analysis}/results/estimations/temp/u/`file'"
}

parallel bootstrap, strata(region) cluster(psu_id) reps($B) programs(duration): duration_model_u
estimates save "${analysis}/results/estimations/4_3_mod_ml_exp_mix_u_boot.ster", replace


***************************************************************************
* Predict out-of-employment and from-unemployment-into-employment hazards *
***************************************************************************

* In the exponential case, the hazard is simply exp(xb()) for any t 
* We weight the hazard by the share of the components in the mixture.

* Import POF
use "${analysis}/data/2_2_pof_clean.dta", clear
svyset psu_id [pweight = pweight], strata(strata_id) singleunit(centered) vce(linearized) 
keep if work_state_name == "Own-account worker"


* FULL SAMPLE: Out-of-employment hazards
****************************************

estimates use "${analysis}/results/estimations/4_3_mod_ml_exp_mix_e.ster"
predictnl haz_out_of_e_0 = (1 - invlogit(_b[ln_odds:_cons])) * exp(xb()) + invlogit(_b[ln_odds:_cons]) * exp(xb() + exp(_b[ln_u:_cons]))
svy: mean haz_out_of_e_0


* FULL SAMPLE: From-unemployment-into-employment hazards
********************************************************

estimates use "${analysis}/results/estimations/4_3_mod_ml_exp_mix_u.ster"
predictnl haz_from_u_to_e_0 = (1 - invlogit(_b[ln_odds:_cons])) * exp(xb()) + invlogit(_b[ln_odds:_cons]) * exp(xb() + exp(_b[ln_u:_cons]))
svy: mean haz_from_u_to_e_0


* BOOTSTRAPPED MODELS: Out-of-employment hazards
************************************************

local models_to_fit_e : dir "${analysis}/results/estimations/temp/e/" files "*.ster"

local i = 1
foreach file of local models_to_fit_e {
  display `i'
  estimates use "${analysis}/results/estimations/temp/e/`file'"
  predictnl haz_out_of_e_`i' = (1 - invlogit(_b[ln_odds:_cons])) * exp(xb()) + invlogit(_b[ln_odds:_cons]) * exp(xb() + exp(_b[ln_u:_cons]))
  local i = `i' + 1
}


* BOOTSTRAPPED MODELS: From-unemployment-into-employment hazards
****************************************************************

local models_to_fit_u : dir "${analysis}/results/estimations/temp/u/" files "*.ster"

local i = 1
foreach file of local models_to_fit_u {
  display `i'
  estimates use "${analysis}/results/estimations/temp/u/`file'"
  predictnl haz_from_u_to_e_`i' = (1 - invlogit(_b[ln_odds:_cons])) * exp(xb()) + invlogit(_b[ln_odds:_cons]) * exp(xb() + exp(_b[ln_u:_cons]))
  local i = `i' + 1
}


* Keep only the fitted values
*****************************

keep  ind_id haz_out_of_e_* haz_from_u_to_e_*
order ind_id haz_out_of_e_* haz_from_u_to_e_*
sort  ind_id
describe
save "${analysis}/data/4_3_est_hazards.dta", replace


***************************
* Estimation output table *
***************************

* Out of employment hazard
use "${analysis}/data/2_5_pnad_clean_duration_e.dta", clear
estimates use "${analysis}/results/estimations/4_3_mod_ml_exp_mix_e_boot.ster"
estimates store ml_exp_mix_e

* Ancillary results
eststo anc_ml_exp_mix_e: nlcom ("shape": _b[shape:_cons]) ("heterog": exp(exp(_b[ln_u:_cons]))) ("share": invlogit(_b[ln_odds:_cons])), post 
matrix define b = e(b)
matrix define V = e(V)

estimates restore ml_exp_mix_e
estadd scalar ws_b b[1, colnumb(b, "shape")]
estadd scalar u_b b[1, colnumb(b, "heterog")]
estadd scalar p_b b[1, colnumb(b, "share")]

estadd scalar ws_sd sqrt(V[colnumb(V, "shape"), colnumb(V, "shape")])
estadd scalar u_sd sqrt(V[colnumb(V, "heterog"), colnumb(V, "heterog")])
estadd scalar p_sd sqrt(V[colnumb(V, "share"), colnumb(V, "share")])

* From unemp into employment hazard
use "${analysis}/data/2_5_pnad_clean_duration_u.dta", clear
estimates use "${analysis}/results/estimations/4_3_mod_ml_exp_mix_u_boot.ster"
estimates store ml_exp_mix_u

* Ancillary results
eststo anc_ml_exp_mix_u: nlcom ("shape": _b[shape:_cons]) ("heterog": exp(exp(_b[ln_u:_cons]))) ("share": invlogit(_b[ln_odds:_cons])), post 
matrix define b = e(b)
matrix define V = e(V)

estimates restore ml_exp_mix_u
estadd scalar ws_b b[1, colnumb(b, "shape")]
estadd scalar u_b b[1, colnumb(b, "heterog")]
estadd scalar p_b b[1, colnumb(b, "share")]

estadd scalar ws_sd sqrt(V[colnumb(V, "shape"), colnumb(V, "shape")])
estadd scalar u_sd sqrt(V[colnumb(V, "heterog"), colnumb(V, "heterog")])
estadd scalar p_sd sqrt(V[colnumb(V, "share"), colnumb(V, "share")])

global region region_*

* Print
esttab ml_exp_mix_e ml_exp_mix_u, eform ///  
  nonumbers mgroups(none) mlabels(, titles) cells("b(fmt(3)) se(par fmt(3))") label collabel("hazard ratio" "s.e.") eqlabels(, none) ///
  drop(shape: ln_u: ln_odds: $region _cons) ///
  refcat( ///
    rg_2_white_female "Ethnicity and gender (ref: Nonwhite female)" ///
    ae_1_2 "Age and education (ref: 14-24, less than prim. school)" ///
    ae_2_1 "[BLANK LINE]" ///
    ae_3_1 "[BLANK LINE]" ///
    ae_4_1 "[BLANK LINE]" ///
    ae_5_1 "[BLANK LINE]" ///
    at_school "Current schooling status (ref: Not currently studying)" ///
    fp_h_wp_wk "Household position (ref: Head, with partner, no kids)" ///
    n_kids "Number of household members by age", nolabel) ///
  stats(u_b u_sd p_b p_sd, ///
  layout("@ (@)" "@ (@)") ///
  labels("Hazard ratio for high type" "Share of high type") ///
  fmt(3 3 3 3))

  
* TABLE 7
*********

esttab ml_exp_mix_e ml_exp_mix_u using "${analysis}/results/tables/table7.tex", eform style(tex) fragment replace ///  
  nonumbers mgroups(none) mlabels(none) cells("b(fmt(3)) se(par fmt(3))") label collabel(none) eqlabels(none) ///
  drop(shape: ln_u: ln_odds: $region _cons) ///
  refcat( ///
    rg_2_white_female   "[nospace]\textit{Ethnicity and gender} [emptycolumns] \\ \hspace{2ex}Nonwhite female" ///
    ae_1_2              "[nospace]\textit{Age and education} [emptycolumns] \\ \hspace{2ex}14-24, less than prim. school" ///
    at_school           "[nospace]\textit{Current schooling status} [emptycolumns] \\ \hspace{2ex}Not currently studying" ///
    fp_h_wp_wk          "[nospace]\textit{Household position} [emptycolumns] \\ \hspace{2ex}Head, with partner, no kids" ///
    n_kids              "[nospace]\textit{Number of household members by age}", nolabel) ///
  stats(u_b u_sd p_b p_sd, ///
  layout("@ (@)" "@ (@)") ///
  labels("\textit{Ancillary mixture parameters} [emptycolumns] \\ \hspace{2ex}Hazard ratio for high type" "\hspace{2ex}Share of high type") ///
  fmt(3 3 3 3)) ///
  varlabels(, /// 
    prefix(\hspace{2ex}) ///
    elist(rg_4_white_male \tabularnewline ae_1_4 \tabularnewline ae_2_4 \tabularnewline ae_3_4 \tabularnewline ae_4_4 \tabularnewline ae_5_4 \tabularnewline at_college \tabularnewline fp_oa \tabularnewline n_seniors \tabularnewline\midrule)) ///
  substitute("\hline" "" ///
             "&" " & " "  " " " "  " " " "  " " " "  " " " ///
             " & & " " & $\cdot$ & " ///
             " & & " " & $\cdot$ & " ///
             " & & " " & $\cdot$ & " ///
             " & \\" " & $\cdot$ \\" ///
             "\hspace{2ex}[nospace]" "" ///
             "[emptycolumns]" "& & & &" ///
             "\textit{Number of household members by age} & $\cdot$ & $\cdot$ & $\cdot$ & $\cdot$ \\" "\textit{Number of household members by age} & & & & \\")  

  
* End of script
***************
cap log close